Analysis of Employee Retention by DDSAnalytics
In this report, we are trying to predict employee retention based on a number of features.
# load in the libraries
library(class)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(e1071)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(corrplot)
## corrplot 0.84 loaded
library(corrgram)
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
##
## Attaching package: 'corrgram'
## The following object is masked from 'package:lattice':
##
## panel.fill
library(naniar)
library(gridExtra)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ tibble 2.1.3 ✓ dplyr 0.8.3
## ✓ tidyr 1.0.0 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.4.0
## ✓ purrr 0.3.3
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::combine() masks gridExtra::combine()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x purrr::lift() masks caret::lift()
attr = read_csv(
"./data/CaseStudy2-data.csv",
col_types = cols(
Attrition = col_character(),
BusinessTravel = col_factor(),
Department = col_factor(),
EducationField = col_factor(),
Gender = col_factor(),
JobRole = col_factor(),
MaritalStatus = col_factor(),
OverTime = col_factor(),
Age = col_integer(),
DailyRate = col_integer(),
DistanceFromHome = col_integer(),
EmployeeNumber = col_integer(),
HourlyRate = col_integer(),
MonthlyIncome = col_integer(),
MonthlyRate = col_integer(),
NumCompaniesWorked = col_integer(),
PercentSalaryHike = col_integer(),
TotalWorkingYears = col_integer(),
TrainingTimesLastYear = col_integer(),
YearsAtCompany = col_integer(),
YearsInCurrentRole = col_integer(),
YearsSinceLastPromotion = col_integer(),
YearsWithCurrManager = col_integer(),
Education = col_integer(),
EnvironmentSatisfaction = col_integer(),
JobInvolvement = col_integer(),
JobLevel = col_integer(),
JobSatisfaction = col_integer(),
PerformanceRating = col_integer(),
RelationshipSatisfaction = col_integer(),
StockOptionLevel = col_integer(),
WorkLifeBalance = col_integer()
)
)
attr = attr %>% select(-one_of( c("ID", "EmployeeCount", "Over18", "StandardHours")))
We see that there are no missing values.
gg_miss_var(attr)

continuous_features = attr %>% select(
Age,
DailyRate,
DistanceFromHome,
EmployeeNumber,
HourlyRate,
MonthlyIncome,
MonthlyRate,
NumCompaniesWorked,
PercentSalaryHike,
TotalWorkingYears,
TrainingTimesLastYear,
YearsAtCompany,
YearsInCurrentRole,
YearsSinceLastPromotion,
YearsWithCurrManager,
Education,
EnvironmentSatisfaction,
JobInvolvement,
JobLevel,
JobSatisfaction,
PerformanceRating,
RelationshipSatisfaction,
StockOptionLevel,
WorkLifeBalance
)
factors = attr %>% select(
BusinessTravel,
Department,
EducationField,
Gender,
JobRole,
MaritalStatus,
OverTime
)
Breaking down the years features, we do see there is some realation between these features and attrition. This relationship is easier to see on a log scale.
attr$LogYearsAtCompany = log(attr$YearsAtCompany + 1)
attr$LogYearsInCurrentRole = log(attr$YearsInCurrentRole + 1)
attr$LogYearsSinceLastPromotion = log(attr$YearsSinceLastPromotion + 1)
attr$LogYearsWithCurrManager = log(attr$YearsWithCurrManager + 1)
feat2 = attr %>% select(Attrition, LogYearsAtCompany, LogYearsInCurrentRole, LogYearsSinceLastPromotion, LogYearsWithCurrManager)
ggpairs(feat2, mapping=ggplot2::aes(color = Attrition), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

attr$YearsBeforeCompany = attr$TotalWorkingYears - attr$YearsAtCompany
attr$LogTotalWorkingYears = log(attr$TotalWorkingYears + 1)
feat3 = attr %>% select(Attrition, Age, MonthlyIncome, YearsBeforeCompany, JobLevel)
ggpairs(feat3, mapping=ggplot2::aes(color = Attrition), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Age and income also look quite important when looking at attrition
attr$LogMonthlyIncome = log(attr$MonthlyIncome + 1)
attr$LogYearsBeforeCompany= log(attr$YearsBeforeCompany + 1)
feat3 = attr %>% select(Attrition, Age, LogMonthlyIncome, LogYearsBeforeCompany, JobLevel)
ggpairs(feat3, mapping=ggplot2::aes(color = Attrition), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

feat4 = attr %>% select(Attrition, PercentSalaryHike, PerformanceRating, NumCompaniesWorked, Education, YearsBeforeCompany)
ggpairs(feat4, mapping=ggplot2::aes(color = Attrition), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

attr$LogNumCompaniesWorked = log(attr$NumCompaniesWorked + 1)
attr$LogYearsBeforeCompany = log(attr$YearsBeforeCompany + 1)
feat5 = attr %>% select(Attrition, PercentSalaryHike, PerformanceRating, LogNumCompaniesWorked, Education, LogYearsBeforeCompany)
ggpairs(feat5, mapping=ggplot2::aes(color = Attrition), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Stock level and work/life balalance also look like they could add some influnece.
feat7 = attr %>% select(Attrition, EnvironmentSatisfaction, JobInvolvement, RelationshipSatisfaction, JobSatisfaction)
ggpairs(feat7, mapping=ggplot2::aes(color = Attrition), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

We can see if taking a single measure accross satiscaction field ties the inforation together
attr$pressurePoint = pmin(
attr$JobSatisfaction,
attr$EnvironmentSatisfaction,
attr$RelationshipSatisfaction
)
attr$TotalSatisfaction = pmax(
attr$JobSatisfaction,
attr$EnvironmentSatisfaction,
attr$RelationshipSatisfaction
)
attr$pressurePointFull = pmin(
attr$JobSatisfaction,
attr$EnvironmentSatisfaction,
attr$RelationshipSatisfaction,
attr$JobInvolvement,
attr$WorkLifeBalance
)
attr$TotalSatisfactionFull = pmax(
attr$JobSatisfaction,
attr$EnvironmentSatisfaction,
attr$RelationshipSatisfaction,
attr$JobInvolvement,
attr$WorkLifeBalance
)
feat7 = attr %>% select(Attrition, pressurePoint, TotalSatisfaction, pressurePointFull, TotalSatisfactionFull)
ggpairs(feat7, mapping=ggplot2::aes(color = Attrition), progress = FALSE)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

cat_feats = attr %>% select(
Attrition,
BusinessTravel,
Department,
EducationField,
Gender,
JobRole,
MaritalStatus,
OverTime
)
ggplot(data = cat_feats, mapping = aes(x=JobRole, fill = Attrition)) + geom_bar(position="fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

tr = ggplot(data = cat_feats, mapping = aes(x=BusinessTravel, fill = Attrition)) + geom_bar(position="fill")
dep = ggplot(data = cat_feats, mapping = aes(x=Department, fill = Attrition)) + geom_bar(position="fill")
ge = ggplot(data = cat_feats, mapping = aes(x=Gender, fill = Attrition)) + geom_bar(position="fill")
ma = ggplot(data = cat_feats, mapping = aes(x=MaritalStatus, fill = Attrition)) + geom_bar(position="fill")
ov = ggplot(data = cat_feats, mapping = aes(x=OverTime, fill = Attrition)) + geom_bar(position="fill")
grid.arrange(tr, dep, ge, ma, ov)

ggplot(data = cat_feats, mapping = aes(x=EducationField, fill = Attrition)) + geom_bar(position="fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

ggplot(data = attr, mapping = aes(x=JobRole, y=LogMonthlyIncome)) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1))

It is interesting to note that the job levels with the highest salary also have the lowest job satisfcation.
ggplot(data = attr, mapping = aes(x=JobRole, y= JobSatisfaction )) + geom_boxplot() + theme(axis.text.x = element_text(angle = 45, hjust = 1))

Stock option has more of a bathtub shaped attrition curve.
ggplot(data = attr, mapping = aes(x=StockOptionLevel, fill = Attrition)) + geom_bar(position="fill") + theme(axis.text.x = element_text(angle = 45, hjust = 1))

Drop the columns that are are not useful or those that have been replaced by their log version
drop_cols = c(
'YearsAtCompany',
'YearsInCurrentRole',
'LogYearsSinceLastPromotion',
'YearsWithCurrManager',
'YearsBeforeCompany',
'TotalWorkingYears',
'NumCompaniesWorked',
'MonthlyIncome',
'DistanceFromHome',
'Gender',
'EmployeeNumber'
)
attr = attr %>% select(-one_of(drop_cols))
attr$Attrition <- attr$Attrition %>% recode('Yes'=1,'No'= 0)
attr$BusinessTravel <- attr$BusinessTravel %>% recode('Travel_Frequently'=2,'Travel_Rarely'=1,'Non-Travel'= 0)
attr$Department <- attr$Department %>% recode('Sales'=2,'Human Resources'=1,'Research & Development'= 0)
attr$EducationField <- attr$EducationField %>% recode('Medical'=0,'Life Sciences'=1,'Other'=2, 'Marketing'=3, 'Technical Degree'=4, 'Human Resources'=5)
attr$OverTime <- attr$OverTime %>% recode('Yes'=1,'No'= 0)
attr$MaritalStatus <- attr$MaritalStatus %>% recode('Divorced'=2,'Married'=1,'Single'= 0)
attr$JobRole <- attr$JobRole %>% recode(
'Sales Representative'=0,
'Human Resources'=1,
'Laboratory Technician'=2,
'Sales Executive'=3,
'Research Scientist'=4,
'Healthcare Representative'=5,
'Manager'=6,
'Research Director'=7,
'Manufacturing Director'=7
)
Taking a subset of the features produces a mch better model.
set.seed(742)
exp_features = attr %>% select(
Attrition,
JobSatisfaction,
MaritalStatus,
JobRole,
StockOptionLevel,
LogMonthlyIncome,
LogYearsInCurrentRole,
)
splitPerc = .70
trainIndices = sample(1:dim(exp_features)[1],round(splitPerc * dim(exp_features)[1]))
train = exp_features[trainIndices,]
test = exp_features[-trainIndices,]
train_labels = train$Attrition
test_labels = test$Attrition
test = test %>% select(-Attrition)
train = train %>% select(-Attrition)
accs = data.frame(sens = numeric(40), spec = numeric(40), k = numeric(40))
for(i in 1:40)
{
classifications = knn(train, test, train_labels, prob = TRUE, k = i)
table(test_labels, classifications)
CM = confusionMatrix(table(test_labels, classifications))
accs$sens[i] = CM$byClass['Sensitivity']
accs$spec[i] = CM$byClass['Specificity']
accs$k[i] = i
}
print(accs)
## sens spec k
## 1 0.8711111 0.3055556 1
## 2 0.8767123 0.3095238 2
## 3 0.8765432 0.5555556 3
## 4 0.8663968 0.5000000 4
## 5 0.8650794 0.6666667 5
## 6 0.8565737 0.4000000 6
## 7 0.8611111 0.5555556 7
## 8 0.8645418 0.6000000 8
## 9 0.8616601 0.6250000 9
## 10 0.8656126 0.7500000 10
## 11 0.8622047 0.7142857 11
## 12 0.8622047 0.7142857 12
## 13 0.8622047 0.7142857 13
## 14 0.8549020 0.5000000 14
## 15 0.8588235 0.6666667 15
## 16 0.8554688 0.6000000 16
## 17 0.8554688 0.6000000 17
## 18 0.8549020 0.5000000 18
## 19 0.8549020 0.5000000 19
## 20 0.8543307 0.4285714 20
## 21 0.8554688 0.6000000 21
## 22 0.8549020 0.5000000 22
## 23 0.8549020 0.5000000 23
## 24 0.8554688 0.6000000 24
## 25 0.8560311 0.7500000 25
## 26 0.8554688 0.6000000 26
## 27 0.8560311 0.7500000 27
## 28 0.8560311 0.7500000 28
## 29 0.8560311 0.7500000 29
## 30 0.8527132 0.6666667 30
## 31 0.8560311 0.7500000 31
## 32 0.8521401 0.5000000 32
## 33 0.8527132 0.6666667 33
## 34 0.8527132 0.6666667 34
## 35 0.8527132 0.6666667 35
## 36 0.8527132 0.6666667 36
## 37 0.8527132 0.6666667 37
## 38 0.8532819 1.0000000 38
## 39 0.8532819 1.0000000 39
## 40 0.8527132 0.6666667 40
comp_noattr = read_csv("./data/CaseStudy2CompSet No Attrition.csv",
col_types = cols(
JobRole = col_factor(),
MaritalStatus = col_factor(),
MonthlyIncome = col_integer(),
YearsInCurrentRole = col_integer(),
JobSatisfaction = col_integer(),
StockOptionLevel = col_integer()
)
)
comp_noattr$LogYearsInCurrentRole = log(comp_noattr$YearsInCurrentRole + 1)
comp_noattr$LogMonthlyIncome = log(comp_noattr$MonthlyIncome + 1)
comp_noattr$BusinessTravel <- comp_noattr$BusinessTravel %>% recode('Travel_Frequently'=2,'Travel_Rarely'=1,'Non-Travel'= 0)
comp_noattr$Department <- comp_noattr$Department %>% recode('Sales'=2,'Human Resources'=1,'Research & Development'= 0)
comp_noattr$EducationField <- comp_noattr$EducationField %>% recode('Medical'=0,'Life Sciences'=1,'Other'=2, 'Marketing'=3, 'Technical Degree'=4, 'Human Resources'=5)
comp_noattr$OverTime <- comp_noattr$OverTime %>% recode('Yes'=1,'No'= 0)
comp_noattr$MaritalStatus <- comp_noattr$MaritalStatus %>% recode('Divorced'=2,'Married'=1,'Single'= 0)
comp_noattr$JobRole <- comp_noattr$JobRole %>% recode(
'Sales Representative'=0,
'Human Resources'=1,
'Laboratory Technician'=2,
'Sales Executive'=3,
'Research Scientist'=4,
'Healthcare Representative'=5,
'Manager'=6,
'Research Director'=7,
'Manufacturing Director'=7
)
comp_noattr_model = comp_noattr %>% select(
JobSatisfaction,
MaritalStatus,
JobRole,
StockOptionLevel,
LogMonthlyIncome,
LogYearsInCurrentRole,
)
comp_classifications = knn(train, comp_noattr_model, train_labels, k = 10)
comp_noattr$Attribution = comp_classifications
comp_noattr = comp_noattr %>% select(ID, Attribution)
write_csv(comp_noattr, 'Case2PredictionsDhillon Attrition.csv')
income_feats = attr %>% mutate(EducationField = factor(EducationField), JobRole = factor(JobRole))
income_feats = income_feats %>% select(Age, JobRole, EducationField, LogTotalWorkingYears, LogMonthlyIncome)
splitPerc = .70
trainIndices = sample(1:dim(exp_features)[1],round(splitPerc * dim(exp_features)[1]))
train = income_feats[trainIndices,]
test = income_feats[-trainIndices,]
income_model <- lm(LogMonthlyIncome ~ Age + JobRole + LogTotalWorkingYears, data=train)
summary(income_model)
##
## Call:
## lm(formula = LogMonthlyIncome ~ Age + JobRole + LogTotalWorkingYears,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.7945 -0.2138 0.0181 0.2068 0.8461
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.3461889 0.0636052 115.497 <2e-16 ***
## Age -0.0005182 0.0017342 -0.299 0.765
## JobRole1 0.0464489 0.0943680 0.492 0.623
## JobRole2 -0.0714951 0.0558274 -1.281 0.201
## JobRole3 0.5598298 0.0571986 9.787 <2e-16 ***
## JobRole4 -0.0682678 0.0552925 -1.235 0.217
## JobRole5 0.5975436 0.0652909 9.152 <2e-16 ***
## JobRole6 1.2268091 0.0779754 15.733 <2e-16 ***
## JobRole7 0.8238575 0.0621791 13.250 <2e-16 ***
## LogTotalWorkingYears 0.3761713 0.0259274 14.509 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2951 on 599 degrees of freedom
## Multiple R-squared: 0.8072, Adjusted R-squared: 0.8043
## F-statistic: 278.7 on 9 and 599 DF, p-value: < 2.2e-16
comp_noIncome = read_csv("./data/CaseStudy2CompSet No Salary.csv",
col_types = cols(
JobRole = col_factor(),
TotalWorkingYears = col_integer(),
Age = col_integer()
)
)
comp_noIncome$JobRole <- comp_noIncome$JobRole %>% recode(
'Sales Representative'=0,
'Human Resources'=1,
'Laboratory Technician'=2,
'Sales Executive'=3,
'Research Scientist'=4,
'Healthcare Representative'=5,
'Manager'=6,
'Research Director'=7,
'Manufacturing Director'=7
)
comp_noIncome = comp_noIncome %>% mutate(JobRole = factor(JobRole))
comp_noIncome$LogTotalWorkingYears = log(comp_noIncome$TotalWorkingYears + 1)
comp_noIncome_model = comp_noIncome %>% select(Age, JobRole, LogTotalWorkingYears)
pred_salary = predict(income_model, comp_noIncome_model)
comp_noIncome$LogMonthlyIncome = pred_salary
comp_noIncome$MonthlyIncome = exp(comp_noIncome$LogMonthlyIncome) - 1
comp_noIncome = comp_noIncome %>% select(ID, MonthlyIncome)
write_csv(comp_noIncome, 'Case2PredictionsDhillon Salary.csv')